1 Generate Data 1d

Let \(X \sim \sum_i^k \pi_i \mathcal{N}_i(\mu_i, \sigma^2)\). With \(k = 10\) and \(n = 1000\), \(N = nk\), \(\pi_i = 1/n\).

n <- 2e3
k <- 10
N <- n*k

set.seed(53)
mux <- c(seq(1,5,by=1),8, 10,15, 20, 25)

mu0 <- mux
(sigma1 <- rep(1,10))
#  [1] 1 1 1 1 1 1 1 1 1 1
Z <- rep(1:k, each = n)

Here are the means

plot(mu0, col = 1:10, pch = 19, cex = 3)

We now generate the data with

X0 <- data.frame(x1 = sapply(Z, function(x) rnorm(1, mu0[x], sigma1[x])), Z = Z)
                 

head(X0[sample(dim(X0)[1]),])
#              x1  Z
# 12664  9.943007  7
# 12617  8.381332  7
# 6000   2.230395  3
# 11073  5.135620  6
# 18730 24.113318 10
# 18019 25.842470 10
dat <- X0[, 1]

And here are the data colored with the truth.

ggCol <- c('#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a')
ggplot(data = X0, aes(x=x1, y = Z, ,color = as.factor(Z))) +
  scale_color_manual(values = ggCol) + 
  geom_point(alpha = 0.5)

2 Run algorithms

2.1 Run \(jk\)-means

Run with \(jk\)-means with \(j=2, k=10\):

options: max iteration=100 convergence error tolerance = 1E-8

K <- k
j <- 2

set.seed(2^10)
system.time(
  jk <- jkmeans::jkmeansEM(as.matrix(dat), K, j , 100, tol = 1E-8)
)
jkv <- apply(jk$zeta, 1,  function(x) which( x == max(x) ))

2.2 Run \(k=10\)-means initialized at the truth

system.time({
  kv <- kmeans(dat, centers = mu0)
}
)
#    user  system elapsed 
#   0.012   0.000   0.012

2.3 Run MCLUST

system.time(
  mcv <- Mclust(dat, 
              G = K, 
              modelNames="E")
)
#    user  system elapsed 
#   0.240   0.012   0.252
mcv
# 'Mclust' model object:
#  best model: univariate, equal variance (E) with 10 components

3 Results

#            Names       ari
# 1     Truth v km 0.5311323
# 2     Truth v jk 0.5494590
# 3 Truth v mclust 0.5613259
Y <- data.frame(X0, kv = kv$cluster,
                mc=mcv$classification,
                jk=jkv)

p1d <- ggplot(data = Y, aes(x=x1, y=Z)) +
  scale_color_manual(values = ggCol) 

4 Generate Data 2d

Let \(X \sim \sum_i^k \pi_i \mathcal{N}_i(\mu_i, \sigma^2_i I)\). With \(k = 10\) and \(n = 1000\), \(N = nk\).

n <- 1e3
k <- 10
N <- n*k

set.seed(49)
mux <- sort(sample(-20:20, 10))
muy <- sample(-20:20, 10)

(mu0 <- cbind(mux, muy))
#       mux muy
#  [1,] -19 -10
#  [2,] -17  -3
#  [3,] -11 -16
#  [4,] -10   5
#  [5,]  -6  16
#  [6,]  -3 -20
#  [7,]  -1   2
#  [8,]   3 -13
#  [9,]   6  -5
# [10,]  18  -6
(sigma1 <- sample(c(1,1,1,2,3,4,5), 10, replace=TRUE))
#  [1] 1 4 2 4 1 5 5 1 5 1
sigma0 <- lapply(sigma1, function(x) matrix(x*c(1,0,0,1), 2, 2))

Z <- rep(1:k, each = n)

Here are the means

plot(mu0, col = 1:10, pch = 19, cex = 3)

We now generate the data with

X0 <- data.frame(t(sapply(Z, function(x) rmvnorm(1, mu0[x,], sigma0[[x]]))), Z)
names(X0) <- c('x1', 'x2', 'Z')

head(X0[sample(dim(X0)[1]),])
#              x1          x2 Z
# 5564  -1.391007 -20.7115938 6
# 3350 -11.039710   4.7961061 4
# 1000 -19.664400  -9.6635771 1
# 8542   5.388799  -0.2730803 9
# 1467 -13.249741   1.0983979 2
# 3507 -11.265466   2.7168030 4
dat <- as.matrix(X0[, 1:2])

And here are the data colored with the truth.

ggCol <- c('#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a')
ggplot(data = X0, aes(x=x1, y=x2, color = as.factor(Z))) +
  scale_color_manual(values = ggCol) + 
  geom_point(alpha = 0.5)

5 Run algorithms

5.1 Run \(jk\)-means

Run with \(jk\)-means with \(j=2, k=10\):

options: max iteration=100 convergence error tolerance = 1E-8

K <- k
j <- 2

set.seed(5431)
set.seed(2^10)
system.time(
  jk <- jkmeans::jkmeansEM(dat, K, j , 100, tol = 1E-8)
)
#    user  system elapsed 
#   0.693   0.004   0.691
jkv <- apply(jk$zeta, 1,  function(x) which( x == max(x) ))

5.2 Run \(k=10\)-means initialized at the truth

system.time({
  kv <- kmeans(dat, centers = mu0)
}
)
#    user  system elapsed 
#   0.002   0.000   0.002

5.3 Run MCLUST

system.time(
  mcv <- Mclust(dat, 
              G = K, 
              modelNames="EII")
)
#    user  system elapsed 
# 171.500   0.697 172.218
mcv
# 'Mclust' model object:
#  best model: spherical, equal volume (EII) with 10 components

6 Results

#            Names       ari
# 1     Truth v km 0.9579922
# 2     Truth v jk 0.8206772
# 3 Truth v mclust 0.9563053
Y <- data.frame(X0, jk=jkv, kv = kv$cluster,
                mc=mcv$classification)

p1 <- ggplot(data = Y, aes(x=x1, y=x2)) +
  scale_color_manual(values = ggCol)